/*******************************************************************************
																				
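	DESCRIPTION: 	This do-file creates the datasets that are fed into R 
					to create the ensemble predictions for 2006 excluding spells 
					that included training (NoTraining file, carrying the 
					training indicator) and, analogously, for recalls 
					(NoRecalls file, carrying the recall indicator).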
					
*******************************************************************************/

* Clear memory and set the numeric prefix used in this file's output names:
clear all
global id_code "002"

* Load the full 2006 dataset.
* NOTE(review): ${data} is not set in this file; presumably defined by a master do-file.
use "${data}/002_DataForR_Full_2006.dta", clear

* Manually re-create the sample split used in the main analysis.
* All bounds are in units of n_order and are inclusive at both ends.
local n_obs = _N

local tuning_l   = 0
local tuning_u   = round(0.1 * `n_obs')

* NOTE(review): the training range below spans round(0.3 * _N) + 1 values of
* n_order because both ends are inclusive; confirm this matches the main analysis.
local training_l = `tuning_u' + 1
local training_u = `training_l' + round(0.3 * `n_obs')

* 1 = tuning range, 2 = training range, 3 = everything else
* (observations with missing n_order also end up in 3):
generate samp = cond(inrange(n_order, `training_l', `training_u'), 2, ///
	cond(inrange(n_order, `tuning_l', `tuning_u'), 1, 3))

* Attach value labels:
label define samp 1 "Tuning" 2 "Training" 3 "Rest"
label values samp samp

* Tabulate the split among observations with non-missing emplAft6M_0M_In:
tabulate samp if !missing(emplAft6M_0M_In), missing

* Attach the training indicator. Every observation in memory must be matched
* (a master-only observation raises an error); using-only observations are dropped.
* NOTE(review): the key uses the wildcard Lop*, which expands to every variable
* in memory starting with "Lop"; presumably only LopNr_PersonNr - verify.
merge 1:1 Lop* InLnr using "${data}/005_UnemploymentCategoryStats", ///
	keepusing(training_combined_6months) ///
	assert(using match) keep(match) nogenerate

* Put the identifiers and split variables first, sort, and write the file:
sort n_order
order LopNr_PersonNr InLnr n n_order inSample samp training_combined_6months
save "${data}/${id_code}_DataForR_Full_NoTraining_2006.dta", replace

* The recall file does not carry the training indicator, so remove it first:
drop training_combined_6months

* Attach AvOrs from the unemployment-spell file. Every observation in memory
* must be matched; using-only observations are dropped.
merge 1:1 LopNr_PersonNr InLnr using "${data}/001_1_UnemploymentSpells_Save", ///
	keepusing(AvOrs) ///
	assert(using match) keep(match) nogenerate

* recalled is 1 when AvOrs equals "3 " exactly (note the trailing blank), else 0:
generate recalled = (AvOrs == "3 ")
drop AvOrs

* Put the identifiers and split variables first, sort, and write the file:
sort n_order
order LopNr_PersonNr InLnr n n_order inSample samp recalled
save "${data}/${id_code}_DataForR_Full_NoRecalls_2006.dta", replace

